import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline
import pandas as pd
import os, glob
import seaborn as sns
sns.set(style="darkgrid")
from IPython.display import display, HTML, Image
import plotly
import plotly.express as px
import plotly.graph_objects as go
# Load the two CSV exports that are analysed below.
data5H = pd.read_csv('15_PCc_02_LHS_500_54854_01_s1_G.csv')
data5K = pd.read_csv('15_PCc_02_LHS_5000_54854_01_s1_G.csv')
# Optional additional datasets (disabled):
#data50K =pd.read_csv('15_PCc_02_LHS_50000_54854_01_s1_G.csv')
#data5k =pd.read_csv(r'F:\ML4ChemcialReactions\chemical5K\10_PC_02_LHS_5000_54854_01_s1_G.csv')

# Strip leading/trailing whitespace from every column name so columns can be
# addressed by their bare names (e.g. 'CaO').
for frame in (data5H, data5K):  # add data50K here if it is loaded above
    frame.columns = list(map(str.strip, frame.columns))

# Show the cleaned column names.
data5H.columns
Index(['T', 'CaO', 'SiO2', 'CO2', 'H2O', 'pH', 'MassWater', 'Ca_aq', 'Si_aq',
'C_aq', 'O_aq', 'H_aq', 'Ca_s', 'Si_s', 'C_s', 'O_s', 'H_s',
'Portlandite', 'AmorfSi', 'Calcite', 'mCSHQ', 'Ca_ss', 'Si_ss',
'H2O_ss', 'V_s', 'Gel_water'],
dtype='object')
#data50K.describe().T
# Show the column names of data5K (whitespace was stripped from them above).
data5K.columns
Index(['T', 'CaO', 'SiO2', 'CO2', 'H2O', 'pH', 'MassWater', 'Ca_aq', 'Si_aq',
'C_aq', 'O_aq', 'H_aq', 'Ca_s', 'Si_s', 'C_s', 'O_s', 'H_s',
'Portlandite', 'AmorfSi', 'Calcite', 'mCSHQ', 'Ca_ss', 'Si_ss',
'H2O_ss', 'V_s', 'Gel_water'],
dtype='object')
# Tag every row of data5K with a presence/absence label per mineral, then
# join the three labels into one categorical 'group' column.
minerals = ['Portlandite', 'AmorfSi', 'Calcite']
for mineral in minerals:
    conditions = [
        (data5K[mineral] > 0),
        (data5K[mineral] == 0)]
    values = [mineral, 'No' + mineral]
    # np.select falls back to the integer 0 when no condition matches.
    # Mixing that with string choices raises TypeError on newer NumPy
    # releases, and on older ones rows with negative or NaN amounts were
    # labelled '0'. An explicit string default keeps the dtype consistent
    # and makes such rows easy to spot.
    data5K[mineral + '_stat'] = np.select(
        conditions, values, default='Unknown' + mineral)

# Build the list from the mineral names instead of scanning every column for
# the substring '_stat', so the order is fixed and unrelated columns are
# never picked up.
colStats = [name + '_stat' for name in minerals]
data5K['group'] = data5K[colStats].agg('-'.join, axis=1)

# Show the distinct label combinations that occur in the data.
data5K['group'].unique()
array(['Portlandite-NoAmorfSi-Calcite', 'NoPortlandite-AmorfSi-Calcite',
'NoPortlandite-NoAmorfSi-Calcite'], dtype=object)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# metrics are used to find accuracy or error
from sklearn import metrics
# Show the columns again; data5K now also carries the '<mineral>_stat' and
# 'group' columns added above.
data5K.columns
Index(['T', 'CaO', 'SiO2', 'CO2', 'H2O', 'pH', 'MassWater', 'Ca_aq', 'Si_aq',
'C_aq', 'O_aq', 'H_aq', 'Ca_s', 'Si_s', 'C_s', 'O_s', 'H_s',
'Portlandite', 'AmorfSi', 'Calcite', 'mCSHQ', 'Ca_ss', 'Si_ss',
'H2O_ss', 'V_s', 'Gel_water', 'Portlandite_stat', 'AmorfSi_stat',
'Calcite_stat', 'group'],
dtype='object')
# Features are the four composition columns; the target is the combined
# mineral-status label stored in 'group'.
inputVars = ['CaO', 'SiO2', 'CO2', 'H2O']
targVars = ['group']
X = data5K[inputVars]
y = data5K[targVars]

# Fixed seed so that the split, the fitted forest and the reported accuracy
# are reproducible from run to run.
RANDOM_STATE = 42

# i.e. 80 % training dataset and 20 % test datasets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE)

# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE)

# Training the model on the training dataset.
# y_train is a one-column DataFrame; scikit-learn expects a 1-D target of
# shape (n_samples,), so flatten it to avoid the DataConversionWarning.
clf.fit(X_train, y_train.values.ravel())

# performing predictions on the test dataset
y_pred = clf.predict(X_test)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
C:\Users\Cyang.MDM-MOD1-D1\AppData\Local\Temp\ipykernel_3932\337620559.py:6: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). clf.fit(X_train, y_train)
ACCURACY OF THE MODEL: 0.963
# Put the true label ('group') and the model prediction ('pred') next to the
# held-out features so both can be plotted from one frame.
X_test = pd.concat(objs=[X_test, y_test], axis="columns")
X_test = X_test.assign(pred=y_pred)

# Show the resulting column names.
X_test.columns
Index(['CaO', 'SiO2', 'CO2', 'H2O', 'group', 'pred'], dtype='object')
# Draw the same CaO / SiO2 / CO2 ternary scatter twice: first coloured by the
# true label, then by the predicted label, for a visual comparison.
for label_column in ("group", "pred"):
    fig = px.scatter_ternary(
        X_test, a="CaO", b="SiO2", c="CO2", color=label_column)
    fig.show()